dir = 'results/pangenome/Bacteroides_xylanisolvens_outgroup'
system(paste0('mkdir -pv ',file.path(dir,'analyses')))
3 Bovatus genomes included in roary run
Raxml with 100 bootstraps run on tree
metadata <- read.table('metadata/strain_ncbi_metadata_assembly_all.txt',header=T)
metadata <- metadata %>%
mutate(taxonomy_Species = str_replace_all(taxonomy_Species,' ','_'),
isolate = str_replace_all(isolate,'[_-]','.'),
site = recode(site,'USA: Cambridge'='USA','USA:Boston'='USA',
'China: Shenzhen'='China','USA:Seattle'='USA',
'USA:Baltimore'='USA', 'not applicable'='siteUnknown',
'missing'='siteUnknown')) %>%
unite(host_site, host, site, sep = "_", remove = FALSE)
#Bxy roary was run with exclude lower quality assemblies
#completeness and contamination associated with genome size and # of predicted genes
Bxy_metadata <- metadata %>% filter(taxonomy_Species == 'Bacteroides_xylanisolvens')
summary(lm_robust(Genome.size ~ Contamination + Completeness, data=Bxy_metadata))
##
## Call:
## lm_robust(formula = Genome.size ~ Contamination + Completeness,
## data = Bxy_metadata)
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper DF
## (Intercept) 513870 1341840 0.383 7.024e-01 -2141190 3168930 128
## Contamination 46545 2377 19.583 2.538e-40 41842 51248 128
## Completeness 58326 13597 4.290 3.498e-05 31422 85231 128
##
## Multiple R-squared: 0.3116 , Adjusted R-squared: 0.3008
## F-statistic: 220.2 on 2 and 128 DF, p-value: < 2.2e-16
summary(lm_robust(X..predicted.genes ~ Contamination+ Completeness, data=Bxy_metadata))
##
## Call:
## lm_robust(formula = X..predicted.genes ~ Contamination + Completeness,
## data = Bxy_metadata)
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper DF
## (Intercept) 8676.61 2046.114 4.241 4.239e-05 4628.02 12725.191 128
## Contamination 56.39 8.462 6.663 7.184e-10 39.64 73.129 128
## Completeness -35.46 20.670 -1.716 8.864e-02 -76.36 5.435 128
##
## Multiple R-squared: 0.2168 , Adjusted R-squared: 0.2046
## F-statistic: 24.99 on 2 and 128 DF, p-value: 6.874e-10
#apply qc filter
Bxy_metadata_filtered <- metadata %>%
filter(taxonomy_Species == 'Bacteroides_xylanisolvens') %>%
filter(Completeness>94&Contamination<5)
summary(lm_robust(Genome.size ~ Contamination + Completeness, data=Bxy_metadata_filtered))
##
## Call:
## lm_robust(formula = Genome.size ~ Contamination + Completeness,
## data = Bxy_metadata_filtered)
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper DF
## (Intercept) 2062548 2037963 1.012 0.31353 -1972137 6097232 121
## Contamination 44173 42699 1.035 0.30295 -40361 128708 121
## Completeness 42709 20542 2.079 0.03972 2041 83378 121
##
## Multiple R-squared: 0.01709 , Adjusted R-squared: 0.0008399
## F-statistic: 2.161 on 2 and 121 DF, p-value: 0.1196
summary(lm_robust(X..predicted.genes ~ Contamination + Completeness, data=Bxy_metadata_filtered))
##
## Call:
## lm_robust(formula = X..predicted.genes ~ Contamination + Completeness,
## data = Bxy_metadata_filtered)
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper DF
## (Intercept) 23531.52 3780.23 6.225 7.199e-09 16047.6 31015.47 121
## Contamination -19.03 58.73 -0.324 7.465e-01 -135.3 97.24 121
## Completeness -184.88 38.04 -4.860 3.564e-06 -260.2 -109.56 121
##
## Multiple R-squared: 0.1735 , Adjusted R-squared: 0.1598
## F-statistic: 14.11 on 2 and 121 DF, p-value: 3.096e-06
#completeness is negatively associated with # of predicted genes, s
ggplot(Bxy_metadata_filtered,aes(x=Completeness,y=X..predicted.genes)) +
geom_point() + theme_bw()
samples_to_exclude <- metadata %>%
filter(Completeness<94|Contamination>5)
samples_to_exclude <- samples_to_exclude$isolate
metadata <- metadata %>%
filter(Completeness>94&Contamination<5)
Bxy_tree_file <- "phylogeny/RAxML_bipartitions.Bacteroides_xylanisolvens_outgroup"
Bxy_tree <- ape::read.tree(file.path(dir,Bxy_tree_file))
Bxy_tree$tip.label <- str_replace_all(Bxy_tree$tip.label,'[_-]','.') #clean up names
Bxy_tree <- drop.tip(Bxy_tree,samples_to_exclude) #remove low QC assemblies
setdiff(Bxy_tree$tip.label,metadata$isolate)
## character(0)
Bxy_metadata <- metadata %>% filter(isolate %in% Bxy_tree$tip.label)
write.table(Bxy_metadata,file=file.path(dir,'analyses/Bxy_metadata.txt'))
ggtree(Bxy_tree) + geom_nodelab()
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: `mutate_()` is deprecated as of dplyr 0.7.0.
## Please use `mutate()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
Bov <- metadata %>% filter(taxonomy_Species == 'Bacteroides_ovatus')
Bov_outgroup <- intersect(Bxy_tree$tip.label,Bov$isolate)
Bxy_tree <- root(Bxy_tree,outgroup=Bov_outgroup)
get_color_palette <- function(tips) {
Bxy_metadata <- Bxy_metadata %>% filter(isolate %in% tips)
vec <- sort(unique(Bxy_metadata$host_site))
return(recode(vec,
'human_USA'='blue3',
'human_China' = 'cyan',
'bonobo_Columbus_Zoo'='red2',
'chimpanzee_Houston_Zoo'='orange2',
'orangutan_Houston_Zoo'='purple4',
'gorilla_Columbus_Zoo'='green3',
'chicken_siteUnknown'='tan',
'missing_siteUnknown'='brown4'))}
host_site_color <- get_color_palette(Bxy_tree$tip.label)
Bxy_tree_plot <- ggtree(Bxy_tree) %<+% Bxy_metadata +
geom_nodepoint(aes(subset = suppressWarnings(as.numeric(label)) > 50),size=.75) +
geom_tippoint(aes(color=host_site), alpha=0.8) +
scale_colour_manual(values=get_color_palette(Bxy_tree$tip.label)) +
geom_tiplab() +
xlim(NA,.5)
Bxy_tree_plot
write.tree(Bxy_tree,file=file.path(dir,'analyses/Bxy.tre'))
ggsave(Bxy_tree_plot,file=file.path(dir,'analyses/Bxy_phylogeny.pdf'),height=15,width=8)
cladeA_MCRA <- ape::getMRCA(Bxy_tree,c('GCA.009102525.1.ASM910252v1','GCA.009093695.1.ASM909369v1'))
cladeA_tree <- extract.clade(Bxy_tree,cladeA_MCRA)
ggtree(cladeA_tree) %<+% metadata +
geom_nodepoint(aes(subset = suppressWarnings(as.numeric(label)) > 50),size=.75) +
geom_tippoint(aes(color=host_site), alpha=0.8) +
scale_colour_manual(values=get_color_palette(cladeA_tree$tip.label)) +
geom_tiplab() +
xlim(NA,.5)
cladeB_MCRA <- ape::getMRCA(Bxy_tree,c('P17.D4','GCA.003464445.1.ASM346444v1'))
cladeB_tree <- extract.clade(Bxy_tree,cladeB_MCRA)
ggtree(cladeB_tree) %<+% metadata +
geom_nodepoint(aes(subset = suppressWarnings(as.numeric(label)) > 50),size=.75) +
geom_tippoint(aes(color=host_site), alpha=0.8) +
scale_colour_manual(values=get_color_palette(cladeB_tree$tip.label)) +
geom_tiplab() +
xlim(NA,.5)
cladeC_MCRA <- ape::getMRCA(Bxy_tree,c('P21.4E','GCA.009102685.1.ASM910268v1'))
cladeC_tree <- extract.clade(Bxy_tree,cladeC_MCRA)
ggtree(cladeC_tree) %<+% metadata +
geom_nodepoint(aes(subset = suppressWarnings(as.numeric(label)) > 50),size=.75) +
geom_tippoint(aes(color=host_site), alpha=0.8) +
scale_colour_manual(values=get_color_palette(cladeC_tree$tip.label)) +
geom_tiplab() +
xlim(NA,.5)
Bxy_metadata <- Bxy_metadata %>% mutate(clade = ifelse(
isolate %in% cladeA_tree$tip.label,'cladeA',
ifelse(isolate %in% cladeB_tree$tip.label,'cladeB',
ifelse(isolate %in% cladeC_tree$tip.label,'cladeC','unassigned'))))
captive1Node <- ape::getMRCA(Bxy_tree,c('P17.D4','P13.A9'))
captive1tree <- extract.clade(Bxy_tree,captive1Node)
captive2Node <- ape::getMRCA(Bxy_tree,c('P21.4E','P21.11C'))
captive2tree <- extract.clade(Bxy_tree,captive2Node)
captive3Node <- ape::getMRCA(Bxy_tree,c('P21.6E','P21.2G'))
captive3tree <- extract.clade(Bxy_tree,captive3Node)
Bxy_tree_plot_cladelabels <- ggtree(Bxy_tree) %<+%
Bxy_metadata +
geom_nodepoint(aes(subset = suppressWarnings(as.numeric(label)) > 50),size=.75) +
geom_tippoint(aes(color=host_site), alpha=0.8) +
scale_colour_manual(values=get_color_palette(Bxy_tree$tip.label)) +
xlim(NA,.5)+
geom_cladelabel(node=captive1Node, color='black', offset=.01,
label="Mixed-host captive clade") +
geom_cladelabel(node=captive2Node, color='black', offset=.01,label="Captive gorilla clade2") +
geom_cladelabel(node=captive3Node, color='black', offset=.01,label="Captive gorilla clade1") +
geom_cladelabel(node=cladeA_MCRA, color='black', offset=.02,label="CladeA") +
geom_cladelabel(node=cladeB_MCRA, color='black', offset=.22,label="CladeB") +
geom_cladelabel(node=cladeC_MCRA, color='black', offset=.2,label="CladeC") +
xlim(NA,.5)
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
Bxy_tree_plot_cladelabels
ggsave(Bxy_tree_plot_cladelabels,
file=file.path(dir,'analyses/Bxy_phylogeny_cladeLabels.pdf'),height=15,width=8)
Bxy_metadata <- Bxy_metadata %>% mutate(captive_clade = ifelse(
isolate %in% captive1tree$tip.label,'mixed.hostB',
ifelse(isolate %in% captive2tree$tip.label,'gorillaB',
ifelse(isolate %in% captive3tree$tip.label,'gorillaC','unassigned'))))
#aln <- read.alignment(file.path(dir,'roary/core_gene_alignment.aln'),format='fasta')
#alnd <- as.matrix(dist.alignment(aln, matrix = "similarity",gap=0))
#colnames(alnd) <- str_replace_all(colnames(alnd),'[_-]','.')
#rownames(alnd) <- str_replace_all(rownames(alnd),'[_-]','.')
#alnd <- alnd[Bxy_metadata$isolate,Bxy_metadata$isolate]
#write.table(alnd,file = file.path(dir,'analyses/core_gene_alignment_dist.txt'),sep='\t')
alnd <- read.table(file = file.path(dir,'analyses/core_gene_alignment_dist.txt'),sep='\t')
hc <- hclust(dist(alnd))
cluster99 <- cutree(hc,h=.01)
cluster99 <- as.data.frame(cluster99) %>% rownames_to_column(var='isolate')
cluster999 <- cutree(hc,h=.001)
cluster999 <- as.data.frame(cluster999) %>% rownames_to_column(var='isolate')
Bxy_metadata <- Bxy_metadata %>% left_join(cluster99,by='isolate') %>% left_join(cluster999,by='isolate')
write.table(Bxy_metadata,file=file.path(dir,'analyses/Bxy_metadata.txt'),quote=F,sep='\t')
Use online dbCAN metada server (http://bcb.unl.edu/dbCAN2/index.php) to annotate pangenome reference produced by roary. Split files to meet size requirements. Run against dbCAN, CAZy, PPR.
pan_genome_fasta <- read.fasta(file.path(dir,'roary/pan_genome_reference.fa'),whole.header = T)
nseqs = length(pan_genome_fasta)
half= nseqs/2
part1 = pan_genome_fasta[1:half]
part2 = pan_genome_fasta[(half+1):nseqs]
system(paste0('mkdir -pv ',file.path(dir,'analyses/annotation')))
seqinr::write.fasta(part1,names=names(part1),
file=file.path(dir,'analyses/annotation/pan_genome_reference_part1.fa'))
seqinr::write.fasta(part2,names=names(part2),
file=file.path(dir,'analyses/annotation/pan_genome_reference_part2.fa'))
Download results from dbCAN meta server
#wget http://bcb.unl.edu/dbCAN2/data/blast/20210319105041/overview.txt -O dbDCAN_part1.txt
#wget http://bcb.unl.edu/dbCAN2/data/blast/20210319111110/overview.txt -O dbDCAN_part2.txt
#system(paste0('mv dbDCAN_part1.txt ',file.path(dir,'analyses/annotation/dbDCAN_part1.txt')))
#system(paste0('mv dbDCAN_part2.txt ',file.path(dir,'analyses/annotation/dbDCAN_part2.txt')))
#read in DBCan
dbcan_p1 <- read.table(file.path(dir,'analyses/annotation/dbDCAN_part1.txt'),sep='\t',header=T)
dbcan_p2 <- read.table(file.path(dir,'analyses/annotation/dbDCAN_part2.txt'),sep='\t',header=T)
head(dbcan_p1) #dbCAN split fasta sequence header so have to add back the Gene ortholog
## Gene.ID HMMER Hotpep DIAMOND Signalp X
## 1 BALBAENG_00590_1 GH33(1-206) GH33 GH33 N 3
## 2 BALBAENG_00617_1 GH76(33-367) GH76 GH76 Y(1-35) 3
## 3 BALBAENG_00619_1 GH92(3-206) GH92 GH92 N 3
## 4 BALBAENG_00625_1 GH92(1-429) GH92 GH92 N 3
## 5 BALBAENG_00627_1 GH76(33-367) GH76 GH76 Y(1-35) 3
## 6 BALBAENG_00628_1 GH76(14-329) GH76 GH76 N 3
dbcan <- dbcan_p1 %>%
add_row(dbcan_p2) %>% #add part2 to end of part2
mutate(Gene.ID=str_sub(Gene.ID, end=-3)) %>% #remove '_1' from end of GeneID
mutate_all(as.character)
header <- names(pan_genome_fasta)
dbcan <- as.data.frame(header) %>%
separate(header,sep=' ',into=c('Gene.ID','Gene')) %>%
left_join(dbcan,by='Gene.ID')
head(dbcan)
## Gene.ID Gene HMMER Hotpep DIAMOND Signalp X
## 1 HFONCOPK_00001 group_13236 <NA> <NA> <NA> <NA> <NA>
## 2 HFONCOPK_00002 group_9967 <NA> <NA> <NA> <NA> <NA>
## 3 HFONCOPK_00003 degA <NA> <NA> <NA> <NA> <NA>
## 4 HFONCOPK_00004 rbsK <NA> <NA> <NA> <NA> <NA>
## 5 HFONCOPK_00005 group_2728 <NA> <NA> <NA> <NA> <NA>
## 6 HFONCOPK_00006 susD_3 <NA> <NA> <NA> <NA> <NA>
write.table(dbcan,file.path(dir,'analyses/annotation/dbDCAN_final.txt'),sep='\t',quote=F)
pres_abs <- read.csv(file.path(dir,'roary/gene_presence_absence.csv'))
colnames(pres_abs) <- str_replace_all(colnames(pres_abs),'[_-]','.')
setdiff(Bxy_metadata$isolate,colnames(pres_abs)) #any isolates missing?
## character(0)
isblank <- function(x) (ifelse(x=="",0,1))
pres_abs <- pres_abs %>%
select(Gene,Annotation,Bxy_metadata$isolate) %>%
mutate_at(vars(-Gene,-Annotation),isblank)
dbcan_isolates <- dbcan %>%
left_join(pres_abs,by='Gene') %>%
filter(!is.na(DIAMOND)) %>%
mutate(consensus = ifelse(DIAMOND != 'N',DIAMOND,
ifelse(HMMER != 'N',HMMER,Hotpep))) %>%
separate(consensus,sep='[(_]',into = c('consensus'),extra = 'drop')
dbcan_isolates <- dbcan_isolates %>%
pivot_longer(cols=Bxy_metadata$isolate,
names_to='isolate',values_to='present') %>%
filter(present == 1) %>%
group_by(isolate,consensus) %>%
tally()
summary_of_CAZY_domains <- dbcan_isolates %>%
group_by(consensus) %>%
summarise(total = sum(n)) %>%
arrange(-total)
## `summarise()` ungrouping output (override with `.groups` argument)
HOST_table <- Bxy_metadata %>%
select(isolate,host_site) %>%
column_to_rownames(var='isolate')
CBM_table <- dbcan_isolates %>%
as.data.frame() %>%
dplyr::filter(stringr::str_starts(consensus, "CBM")) %>%
pivot_wider(names_from=consensus,values_from=n,values_fill=0) %>%
column_to_rownames(var='isolate')
head(CBM_table)
## CBM0+CBM4+GH10 CBM13 CBM20+GH77 CBM32 CBM32+GH2
## GCA.002161115.1.ASM216111v1 1 1 1 2 2
## GCA.002161135.1.ASM216113v1 1 1 1 2 1
## GCA.002959635.1.ASM295963v1 0 0 1 2 1
## GCA.003436085.1.ASM343608v1 1 1 1 2 2
## GCA.003437545.1.ASM343754v1 0 1 1 1 2
## GCA.003458755.1.ASM345875v1 0 1 1 1 2
## CBM32+GH35 CBM35+GH27 CBM48+CE1+CE6 CBM48+GH13 CBM5
## GCA.002161115.1.ASM216111v1 2 1 1 1 3
## GCA.002161135.1.ASM216113v1 2 1 1 1 3
## GCA.002959635.1.ASM295963v1 2 1 1 1 2
## GCA.003436085.1.ASM343608v1 1 1 1 1 3
## GCA.003437545.1.ASM343754v1 2 1 1 1 3
## GCA.003458755.1.ASM345875v1 4 1 1 2 3
## CBM50 CBM50+GH23 CBM50+GH73 CBM57+GH137+GH2
## GCA.002161115.1.ASM216111v1 5 1 1 1
## GCA.002161135.1.ASM216113v1 5 1 1 1
## GCA.002959635.1.ASM295963v1 6 1 1 1
## GCA.003436085.1.ASM343608v1 5 1 1 1
## GCA.003437545.1.ASM343754v1 5 1 1 1
## GCA.003458755.1.ASM345875v1 5 1 1 1
## CBM57+GH2 CBM6 CBM6+GH43 CBM62 CBM67+GH0+GH78 CBM56
## GCA.002161115.1.ASM216111v1 2 1 2 1 1 0
## GCA.002161135.1.ASM216113v1 2 2 2 0 1 1
## GCA.002959635.1.ASM295963v1 2 2 6 1 0 1
## GCA.003436085.1.ASM343608v1 2 1 2 0 1 0
## GCA.003437545.1.ASM343754v1 2 0 5 0 0 1
## GCA.003458755.1.ASM345875v1 5 2 2 3 0 0
## CBM32+GH43 CBM35 CBM35+GH98 CBM4+GH10 CBM57+CBM6
## GCA.002161115.1.ASM216111v1 0 0 0 0 0
## GCA.002161135.1.ASM216113v1 0 0 0 0 0
## GCA.002959635.1.ASM295963v1 1 2 1 1 0
## GCA.003436085.1.ASM343608v1 0 0 0 0 0
## GCA.003437545.1.ASM343754v1 0 0 1 0 1
## GCA.003458755.1.ASM345875v1 0 0 2 0 0
## CBM67+GH78 CBM66 CBM2 CBM61+GH53 CBM0
## GCA.002161115.1.ASM216111v1 0 0 0 0 0
## GCA.002161135.1.ASM216113v1 0 0 0 0 0
## GCA.002959635.1.ASM295963v1 0 0 0 0 0
## GCA.003436085.1.ASM343608v1 0 0 0 0 0
## GCA.003437545.1.ASM343754v1 1 0 0 0 0
## GCA.003458755.1.ASM345875v1 1 0 0 0 0
gheatmap(ggtree(Bxy_tree),CBM_table, width=.3,
colnames_angle=90, colnames_offset_y = .25) +
scale_fill_viridis_c(option="A", name="continuous\nvalue")
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
CAZY_table <- dbcan_isolates %>%
as.data.frame() %>%
pivot_wider(names_from=consensus,values_from=n,values_fill=0) %>%
column_to_rownames(var='isolate')
hc <- hclust(dist(t(CAZY_table)))
plot(hc, hang = -1, cex = 0.6)
cluster <- cutree(hc,k=10)
cazy <- names(c)
df_cazy <- data.frame(cbind(cazy,cluster)) %>% rownames_to_column(var='cazy')
cz<- c('GH2','GT2','GT4')
(cz <- df_cazy %>% filter(cluster>1) %>% pull(cazy))
## [1] "CBM50" "CBM6+GH43" "CE1" "GH0" "GH10" "GH105"
## [7] "GH109" "GH115" "GH117" "GH13" "GH130" "GH16"
## [13] "GH18" "GH2" "GH20" "GH28" "GH29" "GH3"
## [19] "GH31" "GH33" "GH36" "GH43" "GH5" "GH50"
## [25] "GH51" "GH76" "GH78" "GH88" "GH92" "GH95"
## [31] "GH97" "GT0" "GT2" "GT4" "GT51" "PL1"
cazy_sd <- apply(t(CAZY_table), 1, sd, na.rm=TRUE)
cz <- names(sort(cazy_sd,decreasing = TRUE)[1:10])
select_CAZY <- CAZY_table %>% select(cz)
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(cz)` instead of `cz` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
p <- gheatmap(ggtree(Bxy_tree) + ylim(-10,NA),
HOST_table,width=.3,colnames_angle=90,hjust=1) + scale_fill_manual(values=get_color_palette(Bxy_tree$tip.label))
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
library(ggnewscale)
p <- p + new_scale_fill()
p2 <- gheatmap(p,select_CAZY,offset=.1,colnames_angle=90,hjust=1) + scale_fill_viridis_c(option="C")
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
p2
ggsave(p2,file=file.path(dir,'analyses/cazy_heatmap.pdf'),height=6)
## Saving 7 x 6 in image
eggnog <- readr::read_tsv(file.path(dir,'eggnog_mapper/pan_genome_reference.emapper.annotations'),comment = '##')
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_character(),
## evalue = col_double(),
## score = col_double()
## )
## ℹ Use `spec()` for the full column specifications.
eggnog <- eggnog %>% rename('query'='#query') %>% select(query,best_OG_cat,best_OG_name,best_OG_desc)
table(eggnog$best_OG_cat)
##
## - A B C CE CG CH CO CP D
## 4365 1 1 520 1 6 14 60 11 284
## DJ DK DM DT DZ E EF EG EGP EH
## 5 2 26 1 29 647 11 46 36 22
## EH / C EH / EH EJ EL EM EP EQ ET EU F
## 2 1 8 1 8 2 1 10 9 309
## FG FJ FK FP FT G G / P G / S GK GKT
## 3 9 3 4 1 1846 1 2 14 3
## GM GT H HJ HJM HK HP HQ I IM
## 121 1 637 7 1 1 11 1 207 5
## IQ IU J JKL JM K KL KLT KLT / T KO
## 70 1 401 16 26 1053 14 22 1 1
## KT KTU L L / J L / V LO LT LU M M / G
## 85 1 2510 1 1 2 5 5 1718 1
## M / O M / S MNU MOT MP MU MU / H N NOU NT
## 2 1 14 2 7 59 1 108 1 2
## NU O O / O OU P PT Q Q / - S T
## 25 314 1 20 1284 125 82 2 6025 682
## T / - U UW V
## 2 587 7 492
header <- names(pan_genome_fasta)
Bov <- Bxy_metadata$isolate[Bxy_metadata$taxonomy_Species!="Bacteroides_xylanisolvens"]
Bxy <- Bxy_metadata$isolate[Bxy_metadata$taxonomy_Species=="Bacteroides_xylanisolvens"]
df <- as.data.frame(header) %>%
separate(header,sep=' ',into=c('Gene.ID','Gene')) %>%
left_join(eggnog, by = c('Gene.ID' = 'query')) %>%
left_join(pres_abs,by='Gene') %>%
select(-Bov)
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(Bov)` instead of `Bov` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
head(df)
## Gene.ID Gene best_OG_cat best_OG_name
## 1 HFONCOPK_00001 group_13236 <NA> <NA>
## 2 HFONCOPK_00002 group_9967 <NA> <NA>
## 3 HFONCOPK_00003 degA K 4NM06@976|Bacteroidetes
## 4 HFONCOPK_00004 rbsK H 4NENQ@976|Bacteroidetes
## 5 HFONCOPK_00005 group_2728 P 4NDXS@976|Bacteroidetes
## 6 HFONCOPK_00006 susD_3 M 4NE4Y@976|Bacteroidetes
## best_OG_desc
## 1 <NA>
## 2 <NA>
## 3 helix_turn _helix lactose operon repressor
## 4 Catalyzes the phosphorylation of ribose at O-5 in a reaction requiring ATP and magnesium. The resulting D-ribose-5- phosphate can then be used either for sythesis of nucleotides, histidine, and tryptophan, or as a component of the pentose phosphate pathway
## 5 TonB-linked outer membrane protein, SusC RagA family
## 6 RagB, SusD
## Annotation P17.C2 P17.A3 P17.D4 P17.D5 P19.A7
## 1 hypothetical protein 0 0 0 0 0
## 2 hypothetical protein 0 0 0 0 0
## 3 HTH-type transcriptional regulator DegA 0 0 0 0 0
## 4 Ribokinase 0 0 0 0 0
## 5 TonB-dependent receptor SusC 0 0 0 0 0
## 6 hypothetical protein 0 0 0 0 0
## P19.G7 P19.D7.or.C7 P19.10A P19.10B P19.10E P20.6B P20.6G P21.2G P21.4E
## 1 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0
## P21.4F P21.4G P21.6D P21.6E P21.6G P21.11A P21.11C P13.H2 P13.F7 P13.A9
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
## P13.G9 P13.H9 P14.C1 P14.A2 P14.G2 P14.A4 P14.B4 P14.E4 P14.F5 P14.B8 P15.A10
## 1 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0
## P15.B10 P15.E10 P15.F10 P15.F11 GCA.015551805.1.ASM1555180v1
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 1
## 4 0 0 0 0 1
## 5 0 0 0 0 1
## 6 0 0 0 0 1
## GCA.006546965.1.ASM654696v1 GCA.008710235.1.ASM871023v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009102765.1.ASM910276v1 GCA.009102605.1.ASM910260v1
## 1 0 0
## 2 0 1
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009102705.1.ASM910270v1 GCA.009102805.1.ASM910280v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.002161135.1.ASM216113v1 GCA.009102755.1.ASM910275v1
## 1 0 0
## 2 0 0
## 3 0 1
## 4 0 1
## 5 0 1
## 6 0 1
## GCA.009102675.1.ASM910267v1 GCA.009102825.1.ASM910282v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.003469745.1.ASM346974v1 GCA.004167295.1.ASM416729v1
## 1 0 0
## 2 0 0
## 3 0 1
## 4 0 1
## 5 0 1
## 6 0 1
## GCA.009093775.1.ASM909377v1 GCA.003436085.1.ASM343608v1
## 1 0 0
## 2 0 0
## 3 1 0
## 4 1 0
## 5 1 0
## 6 1 0
## GCA.009102685.1.ASM910268v1 GCA.009102845.1.ASM910284v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009102085.1.ASM910208v1 GCA.009102665.1.ASM910266v1
## 1 0 0
## 2 0 1
## 3 0 1
## 4 0 1
## 5 0 1
## 6 0 1
## GCA.009101945.1.ASM910194v1 GCA.009102725.1.ASM910272v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009148965.1.ASM914896v1 GCA.009095545.1.ASM909554v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.003437545.1.ASM343754v1 GCA.003474645.1.ASM347464v1
## 1 0 0
## 2 0 0
## 3 1 0
## 4 1 0
## 5 1 0
## 6 1 0
## GCA.009102015.1.ASM910201v1 GCA.003474245.1.ASM347424v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.003468875.1.ASM346887v1 GCA.015553655.1.ASM1555365v1
## 1 0 0
## 2 0 0
## 3 0 1
## 4 0 1
## 5 0 1
## 6 0 1
## GCA.009102045.1.ASM910204v1 GCA.009102025.1.ASM910202v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.015668785.1.ASM1566878v1 GCA.002161115.1.ASM216111v1
## 1 0 0
## 2 0 0
## 3 1 0
## 4 1 0
## 5 1 0
## 6 1 0
## GCA.009102075.1.ASM910207v1 GCA.009093695.1.ASM909369v1
## 1 0 0
## 2 0 0
## 3 1 0
## 4 1 0
## 5 1 0
## 6 1 0
## GCA.015559535.1.ASM1555953v1 GCA.009101535.1.ASM910153v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.015547545.1.ASM1554754v1 GCA.003464445.1.ASM346444v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009101785.1.ASM910178v1 GCA.009101565.1.ASM910156v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009102225.1.ASM910222v1 GCA.009102105.1.ASM910210v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009101635.1.ASM910163v1 GCA.009102625.1.ASM910262v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009101555.1.ASM910155v1 GCA.009153275.1.ASM915327v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009102465.1.ASM910246v1 GCA.009102135.1.ASM910213v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009102505.1.ASM910250v1 GCA.009093945.1.ASM909394v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009102325.1.ASM910232v1 GCA.009101725.1.ASM910172v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009103215.1.ASM910321v1 GCA.009102525.1.ASM910252v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009102305.1.ASM910230v1 GCA.015556195.1.ASM1555619v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 0 1
## 6 1 1
## GCA.009101615.1.ASM910161v1 GCA.009101775.1.ASM910177v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009102485.1.ASM910248v1 GCA.009102185.1.ASM910218v1
## 1 0 0
## 2 0 0
## 3 1 0
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009153235.1.ASM915323v1 GCA.009101745.1.ASM910174v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009101985.1.ASM910198v1 GCA.009102215.1.ASM910221v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009102385.1.ASM910238v1 GCA.009101685.1.ASM910168v1
## 1 0 0
## 2 0 0
## 3 0 1
## 4 1 1
## 5 0 1
## 6 1 1
## GCA.009093835.1.ASM909383v1 GCA.009102545.1.ASM910254v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009102165.1.ASM910216v1 GCA.009102245.1.ASM910224v1
## 1 0 0
## 2 0 0
## 3 0 1
## 4 0 1
## 5 0 1
## 6 0 1
## GCA.009102285.1.ASM910228v1 GCA.009102785.1.ASM910278v1
## 1 0 0
## 2 0 0
## 3 0 1
## 4 0 1
## 5 0 1
## 6 1 1
## GCA.009101505.1.ASM910150v1 GCA.009102145.1.ASM910214v1
## 1 0 0
## 2 0 0
## 3 0 1
## 4 1 1
## 5 0 0
## 6 1 1
## GCA.009101625.1.ASM910162v1 GCA.009101665.1.ASM910166v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 0 1
## 6 1 1
## GCA.009102575.1.ASM910257v1 GCA.009102265.1.ASM910226v1
## 1 0 0
## 2 0 0
## 3 0 0
## 4 1 1
## 5 0 0
## 6 1 1
## GCA.009102375.1.ASM910237v1 GCA.009101705.1.ASM910170v1
## 1 0 0
## 2 0 0
## 3 0 0
## 4 1 0
## 5 1 0
## 6 1 1
## GCA.009101965.1.ASM910196v1 GCA.009101645.1.ASM910164v1
## 1 0 0
## 2 0 0
## 3 0 0
## 4 1 0
## 5 1 0
## 6 1 1
## GCA.003458755.1.ASM345875v1 GCA.003473975.1.ASM347397v1
## 1 0 0
## 2 0 0
## 3 0 1
## 4 0 1
## 5 0 1
## 6 0 1
df$total <- df %>%
select(Bxy) %>%
mutate_all(as.numeric) %>%
rowSums()
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(Bxy)` instead of `Bxy` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
df <- df %>% filter(total > 0) #remove genes not present after excluding Bov
core = length(Bxy)* .95 #set limits for core, shell, cloud genes
shell = length(Bxy)* .15
df <- df %>% mutate(core_cat = if_else(total>core,'core',
if_else(total>shell,'shell','cloud')),
cog1 =str_sub(best_OG_cat, end=1),
best_OG_cat = replace_na(best_OG_cat,'-'))
df_sh <- df %>% select(-Bxy)
#average breakdown of gene cat
df %>% select(core_cat,Bxy) %>%
pivot_longer(Bxy, names_to = 'isolate') %>%
group_by(core_cat,isolate) %>%
summarise_all(sum) %>%
as.data.frame() %>%
ggplot(aes(x=core_cat,y=value)) +
geom_boxplot(aes(fill=core_cat)) + theme_bw()
df2 <- df %>% select(core_cat,cog1,Bxy) %>%
pivot_longer(Bxy, names_to = 'isolate') %>%
group_by(core_cat,cog1,isolate) %>%
summarise_all(sum) %>%
as.data.frame()
ggplot(df2, aes(x=cog1,y=value)) +
geom_boxplot(aes(fill=cog1)) + theme_bw()
ggplot(df2, aes(x=cog1,y=value)) +
geom_boxplot(aes(fill=core_cat)) + theme_bw()
df3 <- df2 %>%
pivot_wider(names_from = core_cat, values_from = value, values_fill = 0)
df4 <- df3 %>% select(-cog1) %>%
group_by(isolate) %>%
summarise_all(sum) %>%
rename(cloud_total=cloud,shell_total=shell,core_total=core)
df5 <- df3 %>%
left_join(df4,by='isolate') %>%
mutate(cloud_prop = cloud/cloud_total,
shell_prop = shell/shell_total,
core_prop = core/core_total)
df6 <- df5 %>%
select(cog1,isolate,cloud_prop,shell_prop,core_prop) %>%
pivot_longer(cols = c(cloud_prop,shell_prop,core_prop),names_to='core_cat')
ggplot(df6, aes(x=cog1,y=value)) +
geom_boxplot(aes(fill=core_cat)) + theme_bw()
core_vs_shell <- df6 %>%
group_by(cog1) %>%
filter(core_cat %in% c('shell_prop','core_prop')) %>%
rstatix::kruskal_test(value ~ core_cat) %>%
mutate(comp='core_vs_shell')
cloud_vs_shell <- df6 %>%
group_by(cog1) %>%
filter(core_cat %in% c('shell_prop','cloud_prop')) %>%
rstatix::kruskal_test(value ~ core_cat) %>%
mutate(comp='cloud_vs_shell')
cloud_vs_core <- df6 %>%
group_by(cog1) %>%
filter(core_cat %in% c('core_prop','cloud_prop')) %>%
rstatix::kruskal_test(value ~ core_cat) %>%
mutate(comp='cloud_vs_core')
kw <- core_vs_shell %>%
add_row(cloud_vs_shell) %>%
add_row(cloud_vs_core) %>%
filter(p!='NaN')
kw <- kw %>%
mutate(p_adj = p * nrow(kw))
df <- as.data.frame(header) %>%
separate(header,sep=' ',into=c('Gene.ID','Gene')) %>%
left_join(eggnog, by = c('Gene.ID' = 'query')) %>%
left_join(pres_abs,by='Gene') %>%
select(-Bov)
head(df)
## Gene.ID Gene best_OG_cat best_OG_name
## 1 HFONCOPK_00001 group_13236 <NA> <NA>
## 2 HFONCOPK_00002 group_9967 <NA> <NA>
## 3 HFONCOPK_00003 degA K 4NM06@976|Bacteroidetes
## 4 HFONCOPK_00004 rbsK H 4NENQ@976|Bacteroidetes
## 5 HFONCOPK_00005 group_2728 P 4NDXS@976|Bacteroidetes
## 6 HFONCOPK_00006 susD_3 M 4NE4Y@976|Bacteroidetes
## best_OG_desc
## 1 <NA>
## 2 <NA>
## 3 helix_turn _helix lactose operon repressor
## 4 Catalyzes the phosphorylation of ribose at O-5 in a reaction requiring ATP and magnesium. The resulting D-ribose-5- phosphate can then be used either for sythesis of nucleotides, histidine, and tryptophan, or as a component of the pentose phosphate pathway
## 5 TonB-linked outer membrane protein, SusC RagA family
## 6 RagB, SusD
## Annotation P17.C2 P17.A3 P17.D4 P17.D5 P19.A7
## 1 hypothetical protein 0 0 0 0 0
## 2 hypothetical protein 0 0 0 0 0
## 3 HTH-type transcriptional regulator DegA 0 0 0 0 0
## 4 Ribokinase 0 0 0 0 0
## 5 TonB-dependent receptor SusC 0 0 0 0 0
## 6 hypothetical protein 0 0 0 0 0
## P19.G7 P19.D7.or.C7 P19.10A P19.10B P19.10E P20.6B P20.6G P21.2G P21.4E
## 1 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0
## P21.4F P21.4G P21.6D P21.6E P21.6G P21.11A P21.11C P13.H2 P13.F7 P13.A9
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
## P13.G9 P13.H9 P14.C1 P14.A2 P14.G2 P14.A4 P14.B4 P14.E4 P14.F5 P14.B8 P15.A10
## 1 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0
## P15.B10 P15.E10 P15.F10 P15.F11 GCA.015551805.1.ASM1555180v1
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 1
## 4 0 0 0 0 1
## 5 0 0 0 0 1
## 6 0 0 0 0 1
## GCA.006546965.1.ASM654696v1 GCA.008710235.1.ASM871023v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009102765.1.ASM910276v1 GCA.009102605.1.ASM910260v1
## 1 0 0
## 2 0 1
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009102705.1.ASM910270v1 GCA.009102805.1.ASM910280v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.002161135.1.ASM216113v1 GCA.009102755.1.ASM910275v1
## 1 0 0
## 2 0 0
## 3 0 1
## 4 0 1
## 5 0 1
## 6 0 1
## GCA.009102675.1.ASM910267v1 GCA.009102825.1.ASM910282v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.003469745.1.ASM346974v1 GCA.004167295.1.ASM416729v1
## 1 0 0
## 2 0 0
## 3 0 1
## 4 0 1
## 5 0 1
## 6 0 1
## GCA.009093775.1.ASM909377v1 GCA.003436085.1.ASM343608v1
## 1 0 0
## 2 0 0
## 3 1 0
## 4 1 0
## 5 1 0
## 6 1 0
## GCA.009102685.1.ASM910268v1 GCA.009102845.1.ASM910284v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009102085.1.ASM910208v1 GCA.009102665.1.ASM910266v1
## 1 0 0
## 2 0 1
## 3 0 1
## 4 0 1
## 5 0 1
## 6 0 1
## GCA.009101945.1.ASM910194v1 GCA.009102725.1.ASM910272v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009148965.1.ASM914896v1 GCA.009095545.1.ASM909554v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.003437545.1.ASM343754v1 GCA.003474645.1.ASM347464v1
## 1 0 0
## 2 0 0
## 3 1 0
## 4 1 0
## 5 1 0
## 6 1 0
## GCA.009102015.1.ASM910201v1 GCA.003474245.1.ASM347424v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.003468875.1.ASM346887v1 GCA.015553655.1.ASM1555365v1
## 1 0 0
## 2 0 0
## 3 0 1
## 4 0 1
## 5 0 1
## 6 0 1
## GCA.009102045.1.ASM910204v1 GCA.009102025.1.ASM910202v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.015668785.1.ASM1566878v1 GCA.002161115.1.ASM216111v1
## 1 0 0
## 2 0 0
## 3 1 0
## 4 1 0
## 5 1 0
## 6 1 0
## GCA.009102075.1.ASM910207v1 GCA.009093695.1.ASM909369v1
## 1 0 0
## 2 0 0
## 3 1 0
## 4 1 0
## 5 1 0
## 6 1 0
## GCA.015559535.1.ASM1555953v1 GCA.009101535.1.ASM910153v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.015547545.1.ASM1554754v1 GCA.003464445.1.ASM346444v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009101785.1.ASM910178v1 GCA.009101565.1.ASM910156v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009102225.1.ASM910222v1 GCA.009102105.1.ASM910210v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009101635.1.ASM910163v1 GCA.009102625.1.ASM910262v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009101555.1.ASM910155v1 GCA.009153275.1.ASM915327v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009102465.1.ASM910246v1 GCA.009102135.1.ASM910213v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009102505.1.ASM910250v1 GCA.009093945.1.ASM909394v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009102325.1.ASM910232v1 GCA.009101725.1.ASM910172v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009103215.1.ASM910321v1 GCA.009102525.1.ASM910252v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009102305.1.ASM910230v1 GCA.015556195.1.ASM1555619v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 0 1
## 6 1 1
## GCA.009101615.1.ASM910161v1 GCA.009101775.1.ASM910177v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009102485.1.ASM910248v1 GCA.009102185.1.ASM910218v1
## 1 0 0
## 2 0 0
## 3 1 0
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009153235.1.ASM915323v1 GCA.009101745.1.ASM910174v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009101985.1.ASM910198v1 GCA.009102215.1.ASM910221v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009102385.1.ASM910238v1 GCA.009101685.1.ASM910168v1
## 1 0 0
## 2 0 0
## 3 0 1
## 4 1 1
## 5 0 1
## 6 1 1
## GCA.009093835.1.ASM909383v1 GCA.009102545.1.ASM910254v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 1 1
## 6 1 1
## GCA.009102165.1.ASM910216v1 GCA.009102245.1.ASM910224v1
## 1 0 0
## 2 0 0
## 3 0 1
## 4 0 1
## 5 0 1
## 6 0 1
## GCA.009102285.1.ASM910228v1 GCA.009102785.1.ASM910278v1
## 1 0 0
## 2 0 0
## 3 0 1
## 4 0 1
## 5 0 1
## 6 1 1
## GCA.009101505.1.ASM910150v1 GCA.009102145.1.ASM910214v1
## 1 0 0
## 2 0 0
## 3 0 1
## 4 1 1
## 5 0 0
## 6 1 1
## GCA.009101625.1.ASM910162v1 GCA.009101665.1.ASM910166v1
## 1 0 0
## 2 0 0
## 3 1 1
## 4 1 1
## 5 0 1
## 6 1 1
## GCA.009102575.1.ASM910257v1 GCA.009102265.1.ASM910226v1
## 1 0 0
## 2 0 0
## 3 0 0
## 4 1 1
## 5 0 0
## 6 1 1
## GCA.009102375.1.ASM910237v1 GCA.009101705.1.ASM910170v1
## 1 0 0
## 2 0 0
## 3 0 0
## 4 1 0
## 5 1 0
## 6 1 1
## GCA.009101965.1.ASM910196v1 GCA.009101645.1.ASM910164v1
## 1 0 0
## 2 0 0
## 3 0 0
## 4 1 0
## 5 1 0
## 6 1 1
## GCA.003458755.1.ASM345875v1 GCA.003473975.1.ASM347397v1
## 1 0 0
## 2 0 0
## 3 0 1
## 4 0 1
## 5 0 1
## 6 0 1
df$total <- df %>%
select(Bxy) %>%
mutate_all(as.numeric) %>%
rowSums()
df <- df %>% filter(total > 0) #remove genes not present after excluding Bov
core = length(Bxy)* .95 #set limits for core, shell, cloud genes
shell = length(Bxy)* .15
df <- df %>% mutate(core_cat = if_else(total>core,'core','shell'),
cog1 =str_sub(best_OG_cat, end=1),
best_OG_cat = replace_na(best_OG_cat,'-'))
df_sh <- df %>% select(-Bxy)
#average breakdown of gene cat
df %>% select(core_cat,Bxy) %>%
pivot_longer(Bxy, names_to = 'isolate') %>%
group_by(core_cat,isolate) %>%
summarise_all(sum) %>%
as.data.frame() %>%
ggplot(aes(x=core_cat,y=value)) +
geom_boxplot(aes(fill=core_cat)) + theme_bw()
df2 <- df %>% select(core_cat,cog1,Bxy) %>%
pivot_longer(Bxy, names_to = 'isolate') %>%
group_by(core_cat,cog1,isolate) %>%
summarise_all(sum) %>%
as.data.frame()
ggplot(df2, aes(x=cog1,y=value)) +
geom_boxplot(aes(fill=cog1)) + theme_bw()
ggplot(df2, aes(x=cog1,y=value)) +
geom_boxplot(aes(fill=core_cat)) + theme_bw()
df3 <- df2 %>%
pivot_wider(names_from = core_cat, values_from = value, values_fill = 0)
df4 <- df3 %>% select(-cog1) %>%
group_by(isolate) %>%
summarise_all(sum) %>%
rename(shell_total=shell,core_total=core)
df5 <- df3 %>%
left_join(df4,by='isolate') %>%
mutate(shell_prop = shell/shell_total,
core_prop = core/core_total)
shell_cog_prop <- df5 %>%
select(cog1,isolate,shell_prop,core_prop) %>%
pivot_longer(cols = c(shell_prop,core_prop),names_to='core_cat')
ggplot(shell_cog_prop, aes(x=cog1,y=value)) +
geom_boxplot(aes(fill=core_cat)) + theme_bw()
core_vs_shell <- shell_cog_prop %>%
group_by(cog1) %>%
filter(core_cat %in% c('shell_prop','core_prop')) %>%
rstatix::kruskal_test(value ~ core_cat) %>%
mutate(comp='core_vs_shell')
cloud_vs_core <- cloud_vs_core %>%
mutate(p_adj = p * nrow(kw))
Bxy_tree_count <- Bxy_tree
Bxy_tree_count <- makeNodeLabel(Bxy_tree_count)
write.tree(Bxy_tree_count,file=file.path(dir,'count/Bxy_count.tree'))
format_pres_abs_table <- function(pres_abs_table,output){
pres_abs <- read.csv(pres_abs_table)
colnames(pres_abs) <- str_replace_all(colnames(pres_abs),'[_-]','.')
isblank <- function(x) (ifelse(x=="",0,1))
pres_abs <- pres_abs %>%
select(Gene,Bxy_metadata$isolate) %>%
mutate_at(vars(-Gene),isblank)
write.table(pres_abs,file=paste0(output,'_table.txt'),
sep='\t',quote=FALSE,row.names = FALSE)
pres_abs_no_orfans <- pres_abs %>%
mutate(sum = rowSums(across(where(is.numeric)))) %>%
filter(sum>1) %>%
select(-sum)
dim(pres_abs_no_orfans)
write.table(pres_abs_no_orfans,file=paste0(output,'_table_no_orfans.txt'),
sep='\t',quote=FALSE,row.names = FALSE)
}
format_pres_abs_table(pres_abs_table=file.path(dir,'roary/gene_presence_absence.csv'),
output=file.path(dir,'count/roary'))
runCount <- function(tree,data,output,gain) {
system(paste('java -Xmx2048M -cp bin/Count/Count.jar ca.umontreal.iro.evolution.genecontent.AsymmetricWagner -gain ',
gain,
tree,
data,'>',
output,sep=' '))
system(paste0("grep '# CHANGE' ",output," | sed 's/# //' > ",output,".CHANGE"))
system(paste0("grep '# PRESENT' ",output," | sed 's/# //' > ",output,".PRESENT"))
present <- read.table(paste0(output,".PRESENT"),sep='\t',header=TRUE)
res <- present %>%
mutate(istip = if_else(node %in% metadata$isolate,'tip','node')) %>%
rstatix::kruskal_test(genes ~ istip)
res$gain_penalty <- gain
return(res)
}
#compare gain penalty
gain2 <-runCount(tree=file.path(dir,'count/Bxy_count.tree'),
data=file.path(dir,'count/roary_table.txt'),
output=file.path(dir,'count/roary_table'),
gain=2)
gain2 <-runCount(tree=file.path(dir,'count/Bxy_count.tree'),
data=file.path(dir,'count/roary_table_no_orfans.txt'),
output=file.path(dir,'count/roary_table_no_orfans'),
gain=2)
count_output <- read.table(file.path(dir,'count/roary_table_no_orfans.CHANGE'),sep='\t',header=TRUE)
count_tree <- read.tree(file=file.path(dir,'count/Bxy_count.tree'))
#read in count output
count_output <- as.data.frame(count_output) %>%
select(-CHANGE) %>%
rename(label=node) %>%
select(label,family_gain,family_loss,gene_gain,gene_loss) %>%
droplevels.data.frame()
#add count gene gain loss info to tree_object
tree_dat = count_tree %>%
treeio::as_tibble() %>%
left_join(count_output,by='label') %>%
left_join(metadata,by=c('label'='isolate')) %>%
as.treedata()
extract_subtree_treeio <- function(tree_S4,taxa) {
MCRA <- ape::getMRCA(as.phylo(tree_S4),taxa)
subtree <- extract.clade(as.phylo(tree_S4),MCRA)
to_drop <- setdiff(as.phylo(tree_S4)$tip.label,subtree$tip.label)
tree_S4_subtree <- treeio::drop.tip(tree_S4,to_drop)
return(tree_S4_subtree)
}
count_cladeA_tree <- extract_subtree_treeio(tree_dat,c('GCA.009102525.1.ASM910252v1','GCA.009093695.1.ASM909369v1'))
plot_family_gain <- function(tree,title,limit){
ggtree(tree) +
geom_text2(aes(x=branch, label=family_gain, subset=(as.numeric(family_gain)>limit|as.numeric(family_loss)>limit)),
size =3, vjust=-.1, color='firebrick') +
geom_text2(aes(x=branch, label=family_loss,subset=(as.numeric(family_gain)>limit|as.numeric(family_loss)>limit)),
size=3, vjust=1, color='blue') +
geom_tippoint(aes(color=host_site), alpha=0.8) +
scale_colour_manual(values=get_color_palette(as.phylo(tree)$tip.label)) +
geom_treescale() + ggtitle(title)
}
famgain <- plot_family_gain(count_cladeA_tree,'cladeA',100)
plot(famgain)
plot_gene_gain <- function(tree,title,limit){
ggtree(tree) +
geom_text2(aes(x=branch, label=gene_gain,subset=(as.numeric(gene_gain)>limit|as.numeric(gene_loss)>limit)),
size =3, vjust=-.1, color='firebrick') +
geom_text2(aes(x=branch, label=gene_loss,subset=(as.numeric(gene_gain)>limit|as.numeric(gene_loss)>limit)),
size=3, vjust=1, color='blue') +
geom_tippoint(aes(color=host_site), alpha=0.8) +
scale_colour_manual(values=get_color_palette(as.phylo(tree)$tip.label)) +
geom_treescale() + ggtitle(title)
}
plot_gene_gain(count_cladeA_tree,'cladeA',100)
visualize_count <- function(count_output_file,tree_file,output_prefix){
count_output = read.table(count_output_file,sep='\t',header=TRUE)
count_tree = read.tree(file=tree_file)
#read in count output
count_output <- as.data.frame(count_output) %>%
select(-CHANGE) %>%
rename(label=node) %>%
select(label,family_gain,family_loss,gene_gain,gene_loss) %>%
droplevels.data.frame()
#add count gene gain loss info to tree_object
tree_dat = count_tree %>%
treeio::as_tibble() %>%
left_join(count_output,by='label') %>%
left_join(metadata,by=c('label'='isolate')) %>%
as.treedata()
count_cladeA_tree <- extract_subtree_treeio(tree_dat,c('GCA.009102525.1.ASM910252v1','GCA.009093695.1.ASM909369v1'))
count_cladeA_tree_plot <- plot_family_gain(count_cladeA_tree,'cladeA',100)
ggsave(count_cladeA_tree_plot,file=paste0(output_prefix,'_family_cladeA.pdf'),height=10)
count_cladeA_tree_plot <- plot_gene_gain(count_cladeA_tree,'cladeA',100)
print(count_cladeA_tree_plot)
ggsave(count_cladeA_tree_plot,file=paste0(output_prefix,'_gene_cladeA.pdf'),height=10)
count_cladeB_tree <- extract_subtree_treeio(tree_dat,c('P17.D4','GCA.003464445.1.ASM346444v1'))
count_cladeB_tree_plot <- plot_family_gain(count_cladeB_tree,'cladeB',100)
ggsave(count_cladeB_tree_plot,file=paste0(output_prefix,'_family_cladeB.pdf'),height=10)
count_cladeB_tree_plot <- plot_gene_gain(count_cladeB_tree,'cladeB',100)
print(count_cladeB_tree_plot)
ggsave(count_cladeB_tree_plot,file=paste0(output_prefix,'_gene_cladeB.pdf'),height=10)
count_cladeC_tree <- extract_subtree_treeio(tree_dat,c('P21.4E','GCA.009102685.1.ASM910268v1'))
count_cladeC_tree_plot <- plot_family_gain(count_cladeC_tree,'cladeC',100)
ggsave(count_cladeC_tree_plot,file=paste0(output_prefix,'_family_cladeC.pdf'),height=10)
count_cladeC_tree_plot <- plot_gene_gain(count_cladeC_tree,'cladeC',100)
print(count_cladeC_tree_plot)
ggsave(count_cladeC_tree_plot,file=paste0(output_prefix,'_gene_cladeC.pdf'),height=10)
count_captive1Node <- extract_subtree_treeio(tree_dat,c('P17.D4','P13.A9'))
count_captive1Node_plot <- plot_family_gain(count_captive1Node,'Mixed host captive clade',0)
ggsave(count_captive1Node_plot,file=paste0(output_prefix,'_family_captive1.pdf'),height=10)
count_captive1Node_plot <- plot_gene_gain(count_captive1Node,'Mixed host captive clade',0)
print(count_captive1Node_plot)
ggsave(count_captive1Node_plot,file=paste0(output_prefix,'_gene_captive1.pdf'),height=10)
count_captive2Node <- extract_subtree_treeio(tree_dat,c('P21.4E','P21.11C'))
count_captive2Node_plot <- plot_family_gain(count_captive2Node,'Mixed host captive clade',0)
ggsave(count_captive2Node_plot,file=paste0(output_prefix,'_family_captive2.pdf'),height=10)
count_captive2Node_plot <- plot_gene_gain(count_captive2Node,'Mixed host captive clade',0)
print(count_captive2Node_plot)
ggsave(count_captive2Node_plot,file=paste0(output_prefix,'_gene_captive2.pdf'),height=10)
count_captive3Node <- extract_subtree_treeio(tree_dat,c('P21.6E','P21.2G'))
count_captive3Node_plot <- plot_family_gain(count_captive3Node,'Mixed host captive clade',0)
ggsave(count_captive3Node_plot,file=paste0(output_prefix,'_family_captive3.pdf'),height=10)
count_captive3Node_plot <- plot_gene_gain(count_captive3Node,'Mixed host captive clade',0)
print(count_captive3Node_plot)
ggsave(count_captive3Node_plot,file=paste0(output_prefix,'_gene_captive3.pdf'),height=10)
}
visualize_count(count_output_file = file.path(dir,'count/roary_table.CHANGE'),
tree_file = file.path(dir,'count/Bxy_count.tree'),
output_prefix =file.path(dir,'count/roary_table'))
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
visualize_count(count_output_file = file.path(dir,'count/roary_table_no_orfans.CHANGE'),
tree_file = file.path(dir,'count/Bxy_count.tree'),
output_prefix =file.path(dir,'count/roary_table_no_orfans'))
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
of_tab <- read_tsv(file=
file.path(dir,'orthofinder/Results_Mar23/Orthogroups/Orthogroups.GeneCount.tsv'))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## Orthogroup = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
colnames(of_tab) <- str_replace_all(colnames(of_tab),'[-_]','.')
of_tab <- of_tab %>%
mutate(sum = rowSums(select(of_tab, -Orthogroup) > 0))
of_tab <- of_tab %>% filter(sum>0)
write.table(of_tab,file=file.path(dir,'count/orthofinder_table.txt'),
sep='\t',quote=FALSE,row.names = FALSE)
of_tab_no_orfans <- of_tab %>% filter(sum>1) #remove orfans
write.table(of_tab_no_orfans,file=file.path(dir,'count/orthofinder_table_no_orfans.txt'),
sep='\t',quote=FALSE,row.names = FALSE)
runCount(tree=file.path(dir,'count/Bxy_count.tree'),
data=file.path(dir,'count/orthofinder_table.txt'),
output=file.path(dir,'count/orthofinder_table'),
gain=2)
## # A tibble: 1 x 7
## .y. n statistic df p method gain_penalty
## <chr> <int> <dbl> <int> <dbl> <chr> <dbl>
## 1 genes 253 2.97 1 0.0848 Kruskal-Wallis 2
runCount(tree=file.path(dir,'count/Bxy_count.tree'),
data=file.path(dir,'count/orthofinder_table_no_orfans.txt'),
output=file.path(dir,'count/orthofinder_table_no_orfans'),
gain=2)
## # A tibble: 1 x 7
## .y. n statistic df p method gain_penalty
## <chr> <int> <dbl> <int> <dbl> <chr> <dbl>
## 1 genes 253 2.97 1 0.0848 Kruskal-Wallis 2
visualize_count(count_output_file = file.path(dir,'count/orthofinder_table.CHANGE'),
tree_file = file.path(dir,'count/Bxy_count.tree'),
output_prefix =file.path(dir,'count/orthofinder_table'))
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
visualize_count(count_output_file = file.path(dir,'count/orthofinder_table_no_orfans.CHANGE'),
tree_file = file.path(dir,'count/Bxy_count.tree'),
output_prefix =file.path(dir,'count/orthofinder_table_no_orfans'))
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
format_pres_abs_table(pres_abs_table=file.path(dir,'roary_nosplitparalogs/gene_presence_absence.csv'),
output=file.path(dir,'count/roary_nsp'))
runCount(tree=file.path(dir,'count/Bxy_count.tree'),
data=file.path(dir,'count/roary_nsp_table.txt'),
output=file.path(dir,'count/roary_nsp_table'),
gain=2)
## # A tibble: 1 x 7
## .y. n statistic df p method gain_penalty
## <chr> <int> <dbl> <int> <dbl> <chr> <dbl>
## 1 genes 253 0.104 1 0.747 Kruskal-Wallis 2
runCount(tree=file.path(dir,'count/Bxy_count.tree'),
data=file.path(dir,'count/roary_nsp_table_no_orfans.txt'),
output=file.path(dir,'count/roary_nsp_table_no_orfans'),
gain=2)
## # A tibble: 1 x 7
## .y. n statistic df p method gain_penalty
## <chr> <int> <dbl> <int> <dbl> <chr> <dbl>
## 1 genes 253 1.27 1 0.259 Kruskal-Wallis 2
visualize_count(count_output_file = file.path(dir,'count/roary_nsp_table.CHANGE'),
tree_file = file.path(dir,'count/Bxy_count.tree'),
output_prefix =file.path(dir,'count/roary_nsp_table'))
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
visualize_count(count_output_file = file.path(dir,'count/roary_nsp_table_no_orfans.CHANGE'),
tree_file = file.path(dir,'count/Bxy_count.tree'),
output_prefix =file.path(dir,'count/roary_nsp_table_no_orfans'))
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image
## Saving 7 x 10 in image